library(ggplot2)
library(tidyverse)
diamonds %>% count(cut) # count
## # A tibble: 5 × 2
## cut n
## <ord> <int>
## 1 Fair 1610
## 2 Good 4906
## 3 Very Good 12082
## 4 Premium 13791
## 5 Ideal 21551
ggplot(diamonds) + geom_bar(aes(x = cut))
diamonds %>%
count(cut_width(carat, .5))
## # A tibble: 11 × 2
## `cut_width(carat, 0.5)` n
## <fctr> <int>
## 1 [-0.25,0.25] 785
## 2 (0.25,0.75] 29498
## 3 (0.75,1.25] 15977
## 4 (1.25,1.75] 5313
## 5 (1.75,2.25] 2002
## 6 (2.25,2.75] 322
## 7 (2.75,3.25] 32
## 8 (3.25,3.75] 5
## 9 (3.75,4.25] 4
## 10 (4.25,4.75] 1
## 11 (4.75,5.25] 1
ggplot(diamonds) + geom_histogram(aes(x = carat), binwidth = .5)
diamonds %>%
filter(carat < 3) %>%
ggplot() +
geom_histogram(aes(x = carat), binwidth = .01)
*the plot shows that diamonds whose carat is immidiately below 1 or 2 is very small. this implies that integer is important cutpoints for diamond carat
ggplot(diamonds) +
geom_histogram(mapping = aes(x = y), binwidth = 0.5) +
coord_cartesian(ylim = c(0, 50)) # zoom in y 0:50
diamonds %>%
filter(y < 3 | y > 20) %>%
dplyr::select(price, x, y, z) %>%
arrange(y)
## # A tibble: 9 × 4
## price x y z
## <int> <dbl> <dbl> <dbl>
## 1 5139 0.00 0.0 0.00
## 2 6381 0.00 0.0 0.00
## 3 12800 0.00 0.0 0.00
## 4 15686 0.00 0.0 0.00
## 5 18034 0.00 0.0 0.00
## 6 2130 0.00 0.0 0.00
## 7 2130 0.00 0.0 0.00
## 8 2075 5.15 31.8 5.12
## 9 12210 8.09 58.9 8.06
ggplot(diamonds, aes(x = x)) +
geom_histogram(binwidth = .05)
# price around 1500 does not exist
diamonds %>%
filter(carat == .99)
## # A tibble: 23 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.99 Fair I SI2 68.1 56 2811 6.21 6.06 4.18
## 2 0.99 Fair J SI1 55.0 61 2812 6.72 6.67 3.68
## 3 0.99 Fair J SI1 58.0 67 2949 6.57 6.50 3.79
## 4 0.99 Fair I SI1 60.7 66 3337 6.42 6.34 3.87
## 5 0.99 Fair H VS2 71.6 57 3593 5.94 5.80 4.20
## 6 0.99 Very Good J SI1 60.3 57 4002 6.44 6.49 3.90
## 7 0.99 Good F SI2 63.3 54 4052 6.36 6.43 4.05
## 8 0.99 Premium F SI2 60.6 61 4075 6.45 6.38 3.89
## 9 0.99 Ideal I SI1 61.8 57 4763 6.40 6.42 3.96
## 10 0.99 Very Good E SI2 61.8 59 4780 6.30 6.33 3.90
## # ... with 13 more rows
diamonds %>%
filter(carat == 1)
## # A tibble: 1,558 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 1 Premium I SI2 58.2 60 2795 6.61 6.55 3.83
## 2 1 Premium J SI2 62.3 58 2801 6.45 6.34 3.98
## 3 1 Fair G I1 66.4 59 2808 6.16 6.09 4.07
## 4 1 Fair J VS2 65.7 59 2811 6.14 6.07 4.01
## 5 1 Premium H I1 61.3 60 2818 6.43 6.39 3.93
## 6 1 Fair H SI2 65.3 62 2818 6.34 6.12 4.08
## 7 1 Premium F I1 58.9 60 2841 6.60 6.55 3.87
## 8 1 Fair G SI2 67.8 61 2856 5.96 5.90 4.02
## 9 1 Fair I SI1 67.9 62 2856 6.19 6.03 4.15
## 10 1 Fair H SI2 66.1 56 2856 6.21 5.97 4.04
## # ... with 1,548 more rows
diamonds %>%
mutate(id = row_number()) %>%
dplyr::select(x, y, z, id) %>%
gather(variable, value, -id) # melt with `x,y,z` without `id` while remaining it as `id`
## # A tibble: 161,820 × 3
## id variable value
## <int> <chr> <dbl>
## 1 1 x 3.95
## 2 2 x 3.89
## 3 3 x 4.05
## 4 4 x 4.20
## 5 5 x 4.34
## 6 6 x 3.94
## 7 7 x 3.95
## 8 8 x 4.07
## 9 9 x 3.87
## 10 10 x 4.00
## # ... with 161,810 more rows
# the first argument(`variable`) take the colnames and the second argument (`value`) takes the values of the argument for each row
diamonds %>%
mutate(id = row_number()) %>%
dplyr::select(x, y, z, id) %>%
gather(variable, value, -id) %>%
ggplot(aes(x = value)) + geom_density() + facet_grid(variable ~ .) + geom_rug()
geom_rug() gives you the each case, marginal distibutionsfacet_grid display for each variable, you can use facet_grid(~variable) instead tooggplot(diamonds) +
geom_histogram(mapping = aes(x = price)) +
coord_cartesian(xlim = c(100, 5000), ylim = c(0, 3000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(diamonds) +
geom_histogram(mapping = aes(x = price)) +
xlim(100, 5000) + ylim(0, 3000)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 14714 rows containing non-finite values (stat_bin).
## Warning: Removed 5 rows containing missing values (geom_bar).
coord_cartesian simply zoom while xlim and ylim drop the values exceed the given limits and draw the graph
# return `NA` if the value matches the logical vector in the first argument
diamonds2 <- diamonds %>%
mutate(y = ifelse(y < 3 | y > 20, NA, y))
ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
geom_point() # or alternatively geom_point(na.rm = T)
## Warning: Removed 9 rows containing missing values (geom_point).
nycflights13::flights %>%
mutate(
cancelled = is.na(dep_time),
sched_hour = sched_dep_time %/% 100,
sched_min = sched_dep_time %/% 100,
sched_dep_time = sched_hour + sched_min / 60
) %>%
ggplot(mapping = aes(sched_dep_time)) +
geom_freqpoly(mapping = aes(color = cancelled), binwidth = 1/4)
geom_bar), NA is regarded as a new category.# price毎にdensity plotを作成する
ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) +
geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
# cut毎に、priceの位置をboxplotした
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
geom_boxplot()
nycflights13::flights %>%
mutate(
cancelled = is.na(dep_time)
) %>%
ggplot(mapping = aes(x = cancelled, y = sched_dep_time)) + geom_boxplot()
ggplot(data = mpg) +
geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
coord_flip()
ggplot(data = diamonds) +
geom_count(mapping = aes(x = cut, y = color))
diamonds %>%
count(color, cut) %>%
ggplot(mapping = aes(x = color, y = cut)) +
geom_tile(mapping = aes(fill = n))
# the two codes below provide same results
diamonds %>%
count(color, cut)
## Source: local data frame [35 x 3]
## Groups: color [?]
##
## color cut n
## <ord> <ord> <int>
## 1 D Fair 163
## 2 D Good 662
## 3 D Very Good 1513
## 4 D Premium 1603
## 5 D Ideal 2834
## 6 E Fair 224
## 7 E Good 933
## 8 E Very Good 2400
## 9 E Premium 2337
## 10 E Ideal 3903
## # ... with 25 more rows
diamonds %>%
group_by(color, cut) %>%
dplyr::summarise(n = n())
## Source: local data frame [35 x 3]
## Groups: color [?]
##
## color cut n
## <ord> <ord> <int>
## 1 D Fair 163
## 2 D Good 662
## 3 D Very Good 1513
## 4 D Premium 1603
## 5 D Ideal 2834
## 6 E Fair 224
## 7 E Good 933
## 8 E Very Good 2400
## 9 E Premium 2337
## 10 E Ideal 3903
## # ... with 25 more rows
nycflights13::flights %>%
group_by(month, dest) %>%
dplyr::summarise(dep_delay = mean(dep_delay, na.rm = T)) %>%
group_by(dest) %>% # この前でungroup()してもいい
filter(n() == 12) # 12のフライトがあるdestのみを抽出 = 毎月フライトがあるdestを抽出。上記でgroup_by
## Source: local data frame [1,008 x 3]
## Groups: dest [84]
##
## month dest dep_delay
## <int> <chr> <dbl>
## 1 1 ALB 41.396825
## 2 1 ATL 4.471918
## 3 1 AUS 10.301775
## 4 1 BDL 21.108108
## 5 1 BHM 30.347826
## 6 1 BNA 13.227390
## 7 1 BOS 5.235004
## 8 1 BQN 5.870968
## 9 1 BTV 11.702703
## 10 1 BUF 13.796163
## # ... with 998 more rows
ggplot(data = diamonds) +
geom_point(mapping = aes(x = carat, y = price), alpha = .01)
# one option
# cut_width determine the cut points for changing color
# in this example by 0.3 each, [0.45, 0.75], [0.75, 1.05]
ggplot(data = diamonds) +
geom_freqpoly(aes(x = price, color = cut_width(carat, .3)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = diamonds) +
geom_freqpoly(aes(x = price, y = ..density.., color = cut_width(carat, .3)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# another option
# cut_number determine the cut points based on the number of observation in each cluster
# in this case, each cluster of the size (carat) has 10 observations
ggplot(data = diamonds) +
geom_freqpoly(aes(x = price, color = cut_number(carat, 10)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = diamonds) +
geom_freqpoly(aes(x = price, y = ..density.., color = cut_number(carat, 10)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# `x` of boxplot control the number to be plot
# `y` of boxplot control the partion
# weird
ggplot(data = diamonds) +
geom_boxplot(aes(x = carat, y = cut_number(price, 10)))
## Warning: position_dodge requires non-overlapping x intervals
# good
ggplot(data = diamonds) +
geom_boxplot(aes(y = carat, x = cut_number(price, 10))) +
coord_flip()
# hexagonal heatmap
ggplot(diamonds, aes(x = carat, y = price)) +
geom_hex() +
facet_wrap(~ cut, ncol = 1)
library(modelr)
mod <- lm(log(price) ~ log(carat), diamonds)
diamonds3 <- diamonds %>%
add_residuals(mod) %>%
mutate(resid = exp(resid))
ggplot(diamonds3) +
geom_point(mapping = aes(x = carat, y = resid))
cut.ggplot(diamonds3) +
geom_boxplot(aes(x = cut, y = resid))
ggplot(diamonds3) +
geom_boxplot(aes(x = cut, y = carat))
ggplot(diamonds3) +
geom_boxplot(aes(x = cut, y = price))